
/*
********************************************************************************
Description:		Import and clean CQI enquiries dataset

Uses:				"${CQIindata}\cqi_enquiry_open_anon_new.csv"
					"${CQIindata}\cqi_enquiry_anon.csv"
					"${CQIindata}\cqi_rejected_anon.csv"

Saves:				"${CQIoutdata}\CQI_enquiry_all_raw.dta"
					"${CQIoutdata}\CQI_enquiry_incometax.dta"
					
********************************************************************************
*/


*************************************************************
* cleanCQIEnquiries: import and clean CQI Enquiries dataset *
*************************************************************

* Typical usage
	* cleanCQIEnquiries, saving("${CQIoutdata}\CQI_enquiry_incometax.dta")
	
capture program drop cleanCQIEnquiries
program define cleanCQIEnquiries

	syntax, saving(string)

	importCQIEnquiriesRaw, saving("${CQIoutdata}\CQI_enquiry_all_raw.dta")
	tidyCQIEnquiries using "${CQIoutdata}\CQI_enquiry_all_raw.dta", saving("`saving'")

end





***************************************************************************
* importCQIEnquiriesRaw: Import raw dataset, do minimal cleaning and save *
***************************************************************************

* Example usage:
	* importCQIEnquiriesRaw, saving("${CQIoutdata}\CQI_enquiry_all_raw.dta")

capture program drop importCQIEnquiriesRaw
capture program define importCQIEnquiriesRaw


	syntax, saving(string)


	* Import the data
	*****************

	tempfile datafile


	* Open CQI enquiries
	*-------------------

	qui import delimited using "${CQIindata}\cqi_enquiry_open_anon_new.csv", clear

	* Mark as open enquiries
	gen str1 status_ind = "O"

	qui save `datafile'


	* Closed CQI enquiries
	*---------------------

	qui import delimited using "${CQIindata}\cqi_enquiry_anon.csv", clear
	append using `datafile'
	qui save `datafile', replace
	

	* Rejected cases
	*---------------

	qui import delimited using "${CQIindata}\cqi_rejected_anon_add_var.csv", clear

	* Deceased variable isn't in cqi_enquiry_anon.csv so drop it (also, we identify rejection reason from rejection_code)
	drop deceased
	
	* Identify IT and CT cases
	assert (rejection_code < .)
	gen str2 orig_system_ind = cond(rejection_code < 1100, "IT", "CT")

	rename txp_type taxpayer_type_ind
	
	tempfile RejectedCases
	save `RejectedCases',replace
	
	*Open dataset that contains all open and closed enquiries
	use `datafile',clear
	
	*Merge rejected cases with all open and closed enquiries 
	qui merge 1:1 utr_anon tax_yearap_date using `RejectedCases'
	
	* drop case that does not merge (i.e. where we only have information on the rejection code, but not whether it is a targeted or random audit - will later get this info from randomslist_txptype_anon.csv)
	qui drop if _merge==2 


	* Tidy the data
	***************

	* UTR
	*----

	* Drop case with missing utr_anon
	drop if utr_anon == .

	* Rename for consistent with other data
	rename utr_anon utr_no


	* status_ind
	*-----------

	* Drop case with missing status_ind
	drop if status_ind == ""
	

	* taxpayer_type_ind
	*------------------
	
	* Drop where taxpayer_type_ind of "I" (an unknown code) and where blank
	drop if inlist(taxpayer_type_ind, "I", "")
	
	
	* tax_yearap_date
	*----------------

	* Drop one missing tax_yearap_date
	drop if tax_yearap_date == ""

	* Make tax_yearap_date numeric
	qui gen int tax_year_accountperiod = date(lower(substr(tax_yearap_date,1,7)), "DMY", 2014)
	format %td tax_year_accountperiod
	drop tax_yearap_date
	rename tax_year_accountperiod tax_yearap_date
	label variable tax_yearap_date "Tax year or accounting period end date"

	* Create tax year variable
	run "${dofiles}\taxyear.do"
	taxyear tax_yearap_date, gen(tax_year)
	label variable tax_year "Tax year (or tax year of account period end)"	
	
	* More date variables
	*--------------------

	local datevarlist "enqry_start_date settlement_date enqry_closure_date lastfor_enqry_date"
	tempvar tempdatevar
	foreach datevar of local datevarlist {
		gen `tempdatevar' = `datevar'
		drop `datevar'
		qui gen int `datevar' = date(lower(substr(`tempdatevar',1,7)), "DMY", 2014)
		format %td `datevar'
		drop `tempdatevar'
	}


	* Sort and order
	*---------------

	*sort and order the data
	order utr_no tax_year
	sort utr_no tax_year


	* Save the data
	***************

	* Save full raw dataset
	compress
	save "`saving'", replace

end





*************************************************************************************************
* tidyCQIEnquiries: drop superfluous variables, keep income tax observations and tidy things up *
*************************************************************************************************

* Example usage:
	* tidyCQIEnquiries using "${CQIoutdata}\CQI_enquiry_all_raw.dta", saving("${CQIoutdata}\CQI_enquiry_incometax.dta")
	
capture program drop tidyCQIEnquiries
program define tidyCQIEnquiries


	syntax using/, saving(string)
	
	* Open data
	use "`using'"
	

	* Drop variables we don't need
	# delimit ;
	drop multipl_reason_sig deletion_date 
		recovry_potial_ind segment_type_ind failure_notify_sig business_type_desc group_sig 
		pship_review_sig tcn_code setlmt_cancel_date type_changed_date hinwi_case_sig 
		taxcred_claim_sig tpi_yield_sig enqrylink_type_ind sic_code sig_in 
		;
	# delimit cr


	* Keep Income tax observations
	keep if orig_system_ind == "IT"


	* Sort the data
	sort tax_year utr_no


	* Date of last interaction with HMRC
	egen long last_interaction = rowmax(settlement_date enqry_closure_date)
	format %td last_interaction
	label var last_interaction "date of last interaction"

	* Length of audit in days
	gen long audit_length = last_interaction - enqry_start_date
	label var audit_length "length of audit (enquiry open to last interaction), in days"

	* Time to deadline when audit begins
	gen long audit_ttd = lastfor_enqry_date - enqry_start_date 
	label var audit_ttd "time to deadline for audit to start, in days"

	* Tax year in which the last interaction took place
	taxyear last_interaction, gen(year_last_interaction)
	label var year_last_interaction "tax year of last interaction"

	* Tax year in which the first interaction took place
	taxyear enqry_start_date, gen(year_first_interaction)
	label var year_first_interaction "tax year of first interaction"


	* Save data
	compress
	sort tax_year utr_no
	save "`saving'", replace



end
